The risk manager from a consultancy wants to analyse Workers Compensation risk in the United States. He has hired me as a consultant to identify the key factors of high cost workers compensation claim across the US. He provides historical loss experience data.
There are 3 objectives to be met in this document.
Explore the claims experience (using graphs/plots), Build a model to predict high cost workers compnesation claim and highlight the key drivers of high cost claims.
Prove that litigation is the key driver of high cost claims using statistical analysis and tests. Predict if the case will go for litigation. Hence drive the objective 1.
Use NLP techniques to engineer features from description (stated in hint below)
Let us start with objective 1
library(tidyverse)
setwd("D:\\DS\\JLT")
wb2<- "WC_Data_Science Case Study.csv"
compensations <- read.csv(wb2)
compensations_bkp <- read.csv(wb2)
head(compensations)
## Unique_ID Case.Number Date.of.Birth Accident.State Loss.Type
## 1 1 E8P0048 IN Indemnity
## 2 2 E8P0649 IN Indemnity
## 3 3 E9S0726 IN Medical Only
## 4 4 E8P4445 IN Medical Only
## 5 5 E7D4228 IN Indemnity
## 6 6 E8P0662 IN Medical Only
## Status_Updated Loss.Date Closed.Date Report.Date
## 1 Closed 28-09-16 30-11-16 03-10-16
## 2 Open 30-09-16 11-10-16
## 3 Closed 24-01-17 06-01-18 02-03-17
## 4 Closed 29-11-16 21-02-17 02-12-16
## 5 Closed 08-07-16 18-02-17 22-07-16
## 6 Closed 04-10-16 04-03-17 11-10-16
## Cause.Description
## 1 THE IW REPORTED HE WAS WALKING BACK TO FORK LIFT AND FELT A POP IN RIGHT KNEE AND FELL TO FLOOR.
## 2 IE WAS GETTING OFF A FORKLIFT, FOOT SLIPPED AND HE FELT PAIN IN LOWER BACK
## 3 IW WAS WALKING AROUND A SHEETING MACHINE, WHEN SHE HEARD A POP IN LFT ANKLE AND FELT PAIN IN THE UPPER LFT LEG, DIAG: LFT ANKLE STRAIN PER DR.
## 4 IW WAS HANDLING MATERIAL, CONFIRMING ORDER, IW TURNED & HIT HIS RIGHT ELBOW ON A DESK
## 5 IW GOT FIBER GLASS DEBRIS IN EYE
## 6 THE IW WAS PICKING UP A SHEET OF FOAM GLASS WHEN HE FELT A PAIN IN HIS BACK.
## Litigation Occupation Carrier Sector.Industry Claim.Cost High.Cost
## 1 NO Carrier 1 Industrials 13,492 0
## 2 NO Carrier 1 Industrials 400,922 1
## 3 NO Carrier 1 Industrials 4,887 0
## 4 NO Carrier 1 Industrials 2,507 0
## 5 NO Carrier 1 Industrials 2,916 0
## 6 NO Carrier 1 Industrials 4,899 0
#1. Missing values
table(is.na(compensations$Date.of.Birth))
##
## FALSE
## 57749
# Blank values instead of NA so reload with NA, Commas and hyphen in claim amount
# Now read with NA
compensationsWithNA <- read.csv(wb2,header = T, na.strings = c("","NA"))
# Remove the commans and hyphen from claim amount
compensationsWithNA$Claim.Cost <- as.numeric(gsub("[,-]","",compensationsWithNA$Claim.Cost))
# Backup for compensations with NA
compensationsWithNA_bkp <- compensationsWithNA
head(compensationsWithNA)
## Unique_ID Case.Number Date.of.Birth Accident.State Loss.Type
## 1 1 E8P0048 <NA> IN Indemnity
## 2 2 E8P0649 <NA> IN Indemnity
## 3 3 E9S0726 <NA> IN Medical Only
## 4 4 E8P4445 <NA> IN Medical Only
## 5 5 E7D4228 <NA> IN Indemnity
## 6 6 E8P0662 <NA> IN Medical Only
## Status_Updated Loss.Date Closed.Date Report.Date
## 1 Closed 28-09-16 30-11-16 03-10-16
## 2 Open 30-09-16 <NA> 11-10-16
## 3 Closed 24-01-17 06-01-18 02-03-17
## 4 Closed 29-11-16 21-02-17 02-12-16
## 5 Closed 08-07-16 18-02-17 22-07-16
## 6 Closed 04-10-16 04-03-17 11-10-16
## Cause.Description
## 1 THE IW REPORTED HE WAS WALKING BACK TO FORK LIFT AND FELT A POP IN RIGHT KNEE AND FELL TO FLOOR.
## 2 IE WAS GETTING OFF A FORKLIFT, FOOT SLIPPED AND HE FELT PAIN IN LOWER BACK
## 3 IW WAS WALKING AROUND A SHEETING MACHINE, WHEN SHE HEARD A POP IN LFT ANKLE AND FELT PAIN IN THE UPPER LFT LEG, DIAG: LFT ANKLE STRAIN PER DR.
## 4 IW WAS HANDLING MATERIAL, CONFIRMING ORDER, IW TURNED & HIT HIS RIGHT ELBOW ON A DESK
## 5 IW GOT FIBER GLASS DEBRIS IN EYE
## 6 THE IW WAS PICKING UP A SHEET OF FOAM GLASS WHEN HE FELT A PAIN IN HIS BACK.
## Litigation Occupation Carrier Sector.Industry Claim.Cost High.Cost
## 1 NO <NA> Carrier 1 Industrials 13492 0
## 2 NO <NA> Carrier 1 Industrials 400922 1
## 3 NO <NA> Carrier 1 Industrials 4887 0
## 4 NO <NA> Carrier 1 Industrials 2507 0
## 5 NO <NA> Carrier 1 Industrials 2916 0
## 6 NO <NA> Carrier 1 Industrials 4899 0
# Now the values with NA and no hyphen or comma in cost
table(is.na(compensationsWithNA$Date.of.Birth))
##
## FALSE TRUE
## 14272 43477
na_count <-sapply(compensationsWithNA, function(y) sum(length(which(is.na(y)))))
na_count <- data.frame(na_count)
na_count
## na_count
## Unique_ID 9
## Case.Number 9
## Date.of.Birth 43477
## Accident.State 9026
## Loss.Type 9
## Status_Updated 9
## Loss.Date 9
## Closed.Date 15779
## Report.Date 6895
## Cause.Description 946
## Litigation 16204
## Occupation 41426
## Carrier 49945
## Sector.Industry 9
## Claim.Cost 8120
## High.Cost 9
# While I check the data frame na_count then it is noticed that some of the values have exactly 9 NA's which is bit odd
# so I checked the lower and upper values
head(compensationsWithNA)
## Unique_ID Case.Number Date.of.Birth Accident.State Loss.Type
## 1 1 E8P0048 <NA> IN Indemnity
## 2 2 E8P0649 <NA> IN Indemnity
## 3 3 E9S0726 <NA> IN Medical Only
## 4 4 E8P4445 <NA> IN Medical Only
## 5 5 E7D4228 <NA> IN Indemnity
## 6 6 E8P0662 <NA> IN Medical Only
## Status_Updated Loss.Date Closed.Date Report.Date
## 1 Closed 28-09-16 30-11-16 03-10-16
## 2 Open 30-09-16 <NA> 11-10-16
## 3 Closed 24-01-17 06-01-18 02-03-17
## 4 Closed 29-11-16 21-02-17 02-12-16
## 5 Closed 08-07-16 18-02-17 22-07-16
## 6 Closed 04-10-16 04-03-17 11-10-16
## Cause.Description
## 1 THE IW REPORTED HE WAS WALKING BACK TO FORK LIFT AND FELT A POP IN RIGHT KNEE AND FELL TO FLOOR.
## 2 IE WAS GETTING OFF A FORKLIFT, FOOT SLIPPED AND HE FELT PAIN IN LOWER BACK
## 3 IW WAS WALKING AROUND A SHEETING MACHINE, WHEN SHE HEARD A POP IN LFT ANKLE AND FELT PAIN IN THE UPPER LFT LEG, DIAG: LFT ANKLE STRAIN PER DR.
## 4 IW WAS HANDLING MATERIAL, CONFIRMING ORDER, IW TURNED & HIT HIS RIGHT ELBOW ON A DESK
## 5 IW GOT FIBER GLASS DEBRIS IN EYE
## 6 THE IW WAS PICKING UP A SHEET OF FOAM GLASS WHEN HE FELT A PAIN IN HIS BACK.
## Litigation Occupation Carrier Sector.Industry Claim.Cost High.Cost
## 1 NO <NA> Carrier 1 Industrials 13492 0
## 2 NO <NA> Carrier 1 Industrials 400922 1
## 3 NO <NA> Carrier 1 Industrials 4887 0
## 4 NO <NA> Carrier 1 Industrials 2507 0
## 5 NO <NA> Carrier 1 Industrials 2916 0
## 6 NO <NA> Carrier 1 Industrials 4899 0
tail(compensationsWithNA)
## Unique_ID Case.Number Date.of.Birth Accident.State Loss.Type
## 57744 NA <NA> <NA> <NA> <NA>
## 57745 NA <NA> <NA> <NA> <NA>
## 57746 NA <NA> <NA> <NA> <NA>
## 57747 NA <NA> <NA> <NA> <NA>
## 57748 NA <NA> <NA> <NA> <NA>
## 57749 NA <NA> <NA> <NA> <NA>
## Status_Updated Loss.Date Closed.Date Report.Date Cause.Description
## 57744 <NA> <NA> <NA> <NA> <NA>
## 57745 <NA> <NA> <NA> <NA> <NA>
## 57746 <NA> <NA> <NA> <NA> <NA>
## 57747 <NA> <NA> <NA> <NA> <NA>
## 57748 <NA> <NA> <NA> <NA> <NA>
## 57749 <NA> <NA> <NA> <NA> <NA>
## Litigation Occupation Carrier Sector.Industry Claim.Cost High.Cost
## 57744 <NA> <NA> <NA> <NA> NA NA
## 57745 <NA> <NA> <NA> <NA> NA NA
## 57746 <NA> <NA> <NA> <NA> NA NA
## 57747 <NA> <NA> <NA> <NA> NA NA
## 57748 <NA> <NA> <NA> <NA> NA NA
## 57749 <NA> <NA> <NA> <NA> NA NA
# So it seems like the ending values are nothing just NAs so let's remove last 9 records from table
n<-dim(compensationsWithNA)[1]
compensationsWithNA<-compensationsWithNA[1:(n-9),]
# Recheck the na's
na_count <-sapply(compensationsWithNA, function(y) sum(length(which(is.na(y)))))
na_count <- data.frame(na_count)
na_count$percent <- (na_count$na_count/n)*100
na_count
## na_count percent
## Unique_ID 0 0.000000
## Case.Number 0 0.000000
## Date.of.Birth 43468 75.270567
## Accident.State 9017 15.614123
## Loss.Type 0 0.000000
## Status_Updated 0 0.000000
## Loss.Date 0 0.000000
## Closed.Date 15770 27.307832
## Report.Date 6886 11.924016
## Cause.Description 937 1.622539
## Litigation 16195 28.043776
## Occupation 41417 71.718991
## Carrier 49936 86.470761
## Sector.Industry 0 0.000000
## Claim.Cost 8111 14.045265
## High.Cost 0 0.000000
# NA can be removed by replacing with following values:
# 1. Mean
# 2. Median
# 3. Most probably Bayesian value
# 4. k-nearest neighbour
table(is.na(compensationsWithNA$Claim.Cost),compensationsWithNA$High.Cost)
##
## 0 1
## FALSE 43812 5817
## TRUE 8111 0
# The NA's in claim cost are all lower in value as the high cost for them is false
mean(compensationsWithNA$Claim.Cost, na.rm = T)
## [1] 13551.76
# Let's replace all the cost values with mean
compensationsWithNA$Claim.Cost[is.na(compensationsWithNA$Claim.Cost)] <- mean(compensationsWithNA$Claim.Cost, na.rm = T)
# Backup of data till now
compensationsWithNA_bkp2 <- compensationsWithNA
#compensationsWithNA <- compensationsWithNA_bkp2
unique(compensationsWithNA$Litigation)
## [1] NO YES <NA> No Yes
## Levels: No NO Yes YES
# The values in variable Litigatation are No, NO, Yes, YES
compensationsWithNA$Litigation <- toupper(compensationsWithNA$Litigation)
library(ggplot2)
library(plotly)
gg <- ggplot(data=compensations) +
geom_bar(mapping =aes(x=compensations$Carrier),
position = "dodge",width = 10)
ggplotly(gg)
## Warning: position_dodge requires non-overlapping x intervals
gg <- ggplot(data=compensationsWithNA) +
geom_bar(mapping =aes(x=compensationsWithNA$Litigation, fill=compensationsWithNA$Sector.Industry),
position = "dodge")
ggplotly(gg)
# The graph says that for Communication services, Aviation and Consumer disc, the litigation is blank always
# but considering the % of NO, it can be filled as NO for all.
table(compensationsWithNA$Litigation)
##
## NO YES
## 39247 2298
table(compensationsWithNA$Litigation,compensationsWithNA$High.Cost)
##
## 0 1
## NO 36719 2528
## YES 672 1626
gg <- ggplot(data=compensationsWithNA) +
geom_bar(mapping =aes(x=compensationsWithNA$High.Cost, fill=compensationsWithNA$Litigation),
position = "dodge")
ggplotly(gg)
# Let's replace all missing values in litigation as no
compensationsWithNA$Litigation[is.na(compensationsWithNA$Litigation)] <- 'NO'
# Carrier has 86% NA's which we can ignore
# Occupation has 71% NA's which we can ignore
# DOB has 75% NA's but can be ignored
# Closed date can be empty as ticket status can still be open
# Accident state has 15% NA but as of now we can ignore
# Let's check for outliers in claim cost
gg <- ggplot(data = compensationsWithNA, aes(x = compensationsWithNA$Sector.Industry, y = Claim.Cost)) +
geom_boxplot() +
coord_cartesian(ylim = c(min(compensationsWithNA$Claim.Cost),max(compensationsWithNA$Claim.Cost)))
ggplotly(gg)
# There can be outliers but the max value 1.2 million can be reached for the compensation claim but to consider it as outlier more information will be required.
# Moreover, this is bit tricky as for higher values there were no litigations
# As of now we ignore the outliers
# As there are not many variables then we don't need to do PCA or SVD for dimensionality reduction
# Here we have more categorical variables and less continuous variables, so we do the
# bivariate analysis using two way table
# 1. Litigation vs High Cost
gg <- ggplot(data=compensationsWithNA) +
geom_bar(mapping =aes(x=compensationsWithNA$High.Cost, fill=compensationsWithNA$Litigation),
position = "dodge")
ggplotly(gg)
prop.test(table(compensationsWithNA$Litigation, compensationsWithNA$High.Cost))
##
## 2-sample test for equality of proportions with continuity
## correction
##
## data: table(compensationsWithNA$Litigation, compensationsWithNA$High.Cost)
## X-squared = 9720.8, df = 1, p-value < 2.2e-16
## alternative hypothesis: two.sided
## 95 percent confidence interval:
## 0.6130249 0.6509337
## sample estimates:
## prop 1 prop 2
## 0.9244075 0.2924282
# 2. Loss Type vs High Cost
gg <- ggplot(data=compensationsWithNA) +
geom_bar(mapping =aes(x=compensationsWithNA$High.Cost, fill=compensationsWithNA$Loss.Type),
position = "dodge")
ggplotly(gg)
prop.test(table(compensationsWithNA$Loss.Type, compensationsWithNA$High.Cost))
##
## 4-sample test for equality of proportions without continuity
## correction
##
## data: table(compensationsWithNA$Loss.Type, compensationsWithNA$High.Cost)
## X-squared = 4659.2, df = 3, p-value < 2.2e-16
## alternative hypothesis: two.sided
## sample estimates:
## prop 1 prop 2 prop 3 prop 4
## 0.7738857 0.9455967 0.9997187 0.9240506
# 3. Accident State vs High Cost
gg <- ggplot(data=compensationsWithNA) +
geom_bar(mapping =aes(x=compensationsWithNA$High.Cost, fill=compensationsWithNA$Accident.State),
position = "dodge")
ggplotly(gg)
prop.test(table(compensationsWithNA$Accident.State, compensationsWithNA$High.Cost))
## Warning in prop.test(table(compensationsWithNA$Accident.State,
## compensationsWithNA$High.Cost)): Chi-squared approximation may be incorrect
##
## 52-sample test for equality of proportions without continuity
## correction
##
## data: table(compensationsWithNA$Accident.State, compensationsWithNA$High.Cost)
## X-squared = 677.38, df = 51, p-value < 2.2e-16
## alternative hypothesis: two.sided
## sample estimates:
## prop 1 prop 2 prop 3 prop 4 prop 5 prop 6 prop 7
## 0.8333333 0.8463542 0.8759894 0.9405321 1.0000000 0.8719719 0.8862309
## prop 8 prop 9 prop 10 prop 11 prop 12 prop 13 prop 14
## 0.9145497 0.9806094 0.9791667 0.9207746 1.0000000 0.8656126 1.0000000
## prop 15 prop 16 prop 17 prop 18 prop 19 prop 20 prop 21
## 0.9017341 0.8900524 0.8846154 0.9451888 0.9487179 0.9131868 0.8538622
## prop 22 prop 23 prop 24 prop 25 prop 26 prop 27 prop 28
## 0.9427883 0.9267016 0.9506726 0.9560284 0.9214537 0.8831361 0.8310502
## prop 29 prop 30 prop 31 prop 32 prop 33 prop 34 prop 35
## 0.9148936 0.9010417 0.9310345 0.9311741 0.8697730 0.9275362 0.9534110
## prop 36 prop 37 prop 38 prop 39 prop 40 prop 41 prop 42
## 0.9345212 1.0000000 1.0000000 0.9394513 0.8723958 0.8063337 0.9049904
## prop 43 prop 44 prop 45 prop 46 prop 47 prop 48 prop 49
## 0.9195402 0.8375527 0.8995984 0.9273084 0.9293413 0.9223803 0.9270833
## prop 50 prop 51 prop 52
## 0.6250000 0.8720430 0.8928571
# 4. Sectory/Industry vs High Cost
gg <- ggplot(data=compensationsWithNA) +
geom_bar(mapping =aes(x=compensationsWithNA$High.Cost, fill=compensationsWithNA$Sector.Industry),
position = "dodge")
ggplotly(gg)
prop.test(table(compensationsWithNA$Sector.Industry, compensationsWithNA$High.Cost))
##
## 8-sample test for equality of proportions without continuity
## correction
##
## data: table(compensationsWithNA$Sector.Industry, compensationsWithNA$High.Cost)
## X-squared = 750.01, df = 7, p-value < 2.2e-16
## alternative hypothesis: two.sided
## sample estimates:
## prop 1 prop 2 prop 3 prop 4 prop 5 prop 6 prop 7
## 0.9087879 0.9287411 0.9058613 0.9397590 0.9167329 0.9040373 0.8923077
## prop 8
## 0.7956428
# create backup again
compensationsWithNA_bkp3 <- compensationsWithNA
# compensationsWithNA_bkp3 -> compensationsWithNA
compensationsWithNA <- compensationsWithNA[,c(2,5,11,14,16)]
# sample the data for modelling
# 75% of the sample size
smp_size <- floor(0.75 * nrow(compensationsWithNA))
# set the seed to make your partition reproducible
set.seed(123)
train_ind <- sample(seq_len(nrow(compensationsWithNA)), size = smp_size)
train <- compensationsWithNA[train_ind, ]
test <- compensationsWithNA[-train_ind, ]
# Check if the train and test sets are fine
prop.table(table(train$High.Cost))
##
## 0 1
## 0.8990186 0.1009814
prop.table(table(test$High.Cost))
##
## 0 1
## 0.8999654 0.1000346
# It looks fine
library(plyr)
library(rpart.plot)
library(e1071)
library(caret)
library(randomForest)
# Let's create a model now
# 1. Decision based tree
fit <- rpart(train$High.Cost~., data = train, method = "class")
# rpart.plot(fit, extra = 3)
PRE_TDT=predict(fit,data=train,type="class")
confusionMatrix(PRE_TDT,as.factor(train$High.Cost))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 38930 319
## 1 2 4054
##
## Accuracy : 0.9926
## 95% CI : (0.9917, 0.9934)
## No Information Rate : 0.899
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9578
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9999
## Specificity : 0.9271
## Pos Pred Value : 0.9919
## Neg Pred Value : 0.9995
## Prevalence : 0.8990
## Detection Rate : 0.8990
## Detection Prevalence : 0.9063
## Balanced Accuracy : 0.9635
##
## 'Positive' Class : 0
##
# Accuracy is 99.26 percent
# model fit
# note that you must turn the ordinal variables into factor or R wont use
# them properly
model <- randomForest(y=train$High.Cost,x=cbind(as.factor(train$Loss.Type),as.factor(train$Litigation),as.factor(train$Sector.Industry)),ntree=100)
## Warning in randomForest.default(y = train$High.Cost, x =
## cbind(as.factor(train$Loss.Type), : The response has five or fewer unique
## values. Are you sure you want to do regression?
#plot of model accuracy by class
plot(model)
# Other models can also be created#
# Radial Support vector Machine
# lasso-ridge regression
# Linear Support vector Machine
# Logistic Regression
# It looks like the key drivers from the analysis as of now are sectory/industry, litigation and loss type.
# Moreover the DOB can also contribute by calculating the age of the worker.
# Claim cost is removed because it is certain that if the claim cost is > 20k then automatically high cost is true.
# Another analysis could be by calculating the claim cost and then further see if the cost is >20k or <20k
# Area wise analysis can also be done.
# As the accident state has postal codes of US, I checked for the list of US postal codes
USStateCodes <- read.csv("US State Codes.csv")
view(USStateCodes)
Now let’s start with Objective 2. Here we have to Prove that litigation is the key driver of high cost claims using statistical analysis and tests. Predict if the case will go for litigation. Hence drive the objective 1.
library(plotROC)
library(InformationValue)
library(ROCR)
library(ggpubr)
# OBJECTIVE 2
# ------------
# Null Hypothesis - If Litigation is YES, High Cost is 1
# Let's start by taking the new data and removing all the missing Litigations
objective2 <- read.csv(wb2, na.strings = c("","NA"))
objective2_bkp <- read.csv(wb2, na.strings = c("","NA"))
objective2 <- objective2[,c('Litigation','High.Cost')]
unique(objective2$Litigation)
## [1] NO YES <NA> No Yes
## Levels: No NO Yes YES
# The values in variable Litigatation are No, NO, Yes, YES
objective2$Litigation <- toupper(objective2$Litigation)
gg <- ggplot(data=objective2) +
geom_bar(mapping =aes(x=objective2$Litigation, fill=objective2$High.Cost),
position = "dodge")
ggplotly(gg)
# Remove all the null rows from dataset
objective2 <- na.omit(objective2)
# Replace YES With 1 and NO with 0
objective2 <- objective2 %>%
mutate(Litigation = ifelse(Litigation == "NO",0,1))
head(objective2)
## Litigation High.Cost
## 1 0 0
## 2 0 1
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
cor.test(objective2$Litigation, objective2$High.Cost, method=c("pearson", "kendall", "spearman"))
##
## Pearson's product-moment correlation
##
## data: objective2$Litigation and objective2$High.Cost
## t = 114.6, df = 41543, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.4827538 0.4973666
## sample estimates:
## cor
## 0.4900947
ggscatter(objective2, x = "Litigation", y = "High.Cost",
add = "reg.line", conf.int = TRUE,
cor.coef = TRUE, cor.method = "pearson",
xlab = "Litigation", ylab = "High Cost")
# The correlation between the variables comes out to be 0.49 which doesn't necessarily show that the variables are highly correlated
# Find the chi-squared value and p value to check the hypothesis
tbl <- as.matrix(table(objective2$Litigation,objective2$High.Cost))
chi2 = chisq.test(tbl, correct=F)
c(chi2$statistic, chi2$p.value)
## X-squared
## 9978.809 0.000
# here the p-value comes to be very small which makes it more prominent to reject the hypothesis
sqrt(chi2$statistic / sum(tbl))
## X-squared
## 0.4900947
# It again shows that the correlation is 0.49
# Let's try now creating the logistic regression model as we have to see the binary value.
# We will start again by creating training and testing data sets
# sample the data for modelling
# 75% of the sample size
smp_size2 <- floor(0.75 * nrow(objective2))
# set the seed to make your partition reproducible
set.seed(123)
train_ind2 <- sample(seq_len(nrow(objective2)), size = smp_size2)
train2 <- objective2[train_ind2, ]
test2 <- objective2[-train_ind2, ]
# Check if the train and test sets are fine
prop.table(table(train2$High.Cost))
##
## 0 1
## 0.8995443 0.1004557
prop.table(table(test2$High.Cost))
##
## 0 1
## 0.90141523 0.09858477
# It looks fine
# Create logistic model
# Step 1: Build Logit Model on Training Dataset
model2 <- glm (test2$High.Cost ~ ., data = test2, family = binomial)
summary(model)
## Length Class Mode
## call 4 -none- call
## type 1 -none- character
## predicted 43305 -none- numeric
## mse 100 -none- numeric
## rsq 100 -none- numeric
## oob.times 43305 -none- numeric
## importance 3 -none- numeric
## importanceSD 0 -none- NULL
## localImportance 0 -none- NULL
## proximity 0 -none- NULL
## ntree 1 -none- numeric
## mtry 1 -none- numeric
## forest 11 -none- list
## coefs 0 -none- NULL
## y 43305 -none- numeric
## test 0 -none- NULL
## inbag 0 -none- NULL
# Step 2: Predict Y on Test Dataset
predict <- predict(model2, type = 'response')
#confusion matrix
table(test2$High.Cost, predict > 0.5)
##
## FALSE TRUE
## 0 9197 166
## 1 636 388
# ROCR Curve
ROCRpred <- prediction(predict, test2$High.Cost)
ROCRperf <- performance(ROCRpred, 'tpr','fpr')
plot(ROCRperf, colorize = TRUE)
Concordance(test2$High.Cost, predict)
## $Concordance
## [1] 0.3721885
##
## $Discordance
## [1] 0.6278115
##
## $Tied
## [1] 0
##
## $Pairs
## [1] 9587712
# The concordance also comes out to be 0.37 which also proves that the null hypothesis is incorrect
plotROC(test2$High.Cost, predict)
# Area under ROC curve is also less which gain proves that null hypothesis is incorrect
# Hence the conclusion is that the null hypothesis that Litigation is the key factor in high cost claims
# surprisingly is proven wrong.
library(tm)
library(SnowballC)
library(wordcloud)
library(RColorBrewer)
library(plyr)
library(installr)
library(RSentiment)
# OBJECTIVE 3
# ------------
# Now we have to feature new variables using the description.
# We will start by creating the term document matrix and then creating the word cloud to see
# what is the frequency of terms
objective3 <- read.csv(wb2,header = T, na.strings = c("","NA"))
objective3_bkp <- objective3
head(objective3[,'Cause.Description'])
## [1] THE IW REPORTED HE WAS WALKING BACK TO FORK LIFT AND FELT A POP IN RIGHT KNEE AND FELL TO FLOOR.
## [2] IE WAS GETTING OFF A FORKLIFT, FOOT SLIPPED AND HE FELT PAIN IN LOWER BACK
## [3] IW WAS WALKING AROUND A SHEETING MACHINE, WHEN SHE HEARD A POP IN LFT ANKLE AND FELT PAIN IN THE UPPER LFT LEG, DIAG: LFT ANKLE STRAIN PER DR.
## [4] IW WAS HANDLING MATERIAL, CONFIRMING ORDER, IW TURNED & HIT HIS RIGHT ELBOW ON A DESK
## [5] IW GOT FIBER GLASS DEBRIS IN EYE
## [6] THE IW WAS PICKING UP A SHEET OF FOAM GLASS WHEN HE FELT A PAIN IN HIS BACK.
## 41925 Levels: '2 Assist with walker and gaitbelt to stand and resident did not help to stand. Tried this x 2.' '2 Assist with walker and gaitbelt to stand and resident did not help to stand. ...
table(is.na(objective3[,'Cause.Description']))
##
## FALSE TRUE
## 56803 946
# There are 946 NA's in Cause description. We will remove all those rows as in this objective
# we are only concerned with cause description
objective3 <- objective3[,c('Cause.Description','High.Cost')]
# Remove all the null rows from dataset
objective3 <- na.omit(objective3)
head(objective3)
## Cause.Description
## 1 THE IW REPORTED HE WAS WALKING BACK TO FORK LIFT AND FELT A POP IN RIGHT KNEE AND FELL TO FLOOR.
## 2 IE WAS GETTING OFF A FORKLIFT, FOOT SLIPPED AND HE FELT PAIN IN LOWER BACK
## 3 IW WAS WALKING AROUND A SHEETING MACHINE, WHEN SHE HEARD A POP IN LFT ANKLE AND FELT PAIN IN THE UPPER LFT LEG, DIAG: LFT ANKLE STRAIN PER DR.
## 4 IW WAS HANDLING MATERIAL, CONFIRMING ORDER, IW TURNED & HIT HIS RIGHT ELBOW ON A DESK
## 5 IW GOT FIBER GLASS DEBRIS IN EYE
## 6 THE IW WAS PICKING UP A SHEET OF FOAM GLASS WHEN HE FELT A PAIN IN HIS BACK.
## High.Cost
## 1 0
## 2 1
## 3 0
## 4 0
## 5 0
## 6 0
data <- objective3[,'Cause.Description']
# Load the data as a corpus
docs <- Corpus(VectorSource(data))
inspect(docs[1:20])
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 20
##
## [1] THE IW REPORTED HE WAS WALKING BACK TO FORK LIFT AND FELT A POP IN RIGHT KNEE AND FELL TO FLOOR.
## [2] IE WAS GETTING OFF A FORKLIFT, FOOT SLIPPED AND HE FELT PAIN IN LOWER BACK
## [3] IW WAS WALKING AROUND A SHEETING MACHINE, WHEN SHE HEARD A POP IN LFT ANKLE AND FELT PAIN IN THE UPPER LFT LEG, DIAG: LFT ANKLE STRAIN PER DR.
## [4] IW WAS HANDLING MATERIAL, CONFIRMING ORDER, IW TURNED & HIT HIS RIGHT ELBOW ON A DESK
## [5] IW GOT FIBER GLASS DEBRIS IN EYE
## [6] THE IW WAS PICKING UP A SHEET OF FOAM GLASS WHEN HE FELT A PAIN IN HIS BACK.
## [7] IW TRIPPED AND FELL TO GROUND DUE TO BANDING ON PALLET
## [8] IE SUFFERED A SEIZURE, COLLAPSING ON FLOOR;
## [9] IE WAS PULLING AN ORDER AND BOX FELL ONTO HIS HEAD CAUSING NECK AND BACK PAIN.
## [10] IW LOADING INSULATION BUNDLES INTO TRUCK WHEN IW FELT PAIN IN LEFT THUMB AND LEFT THUMB
## [11] CONSTANT BENDING LIFTING ON AND OFF HIGHLO
## [12] TEAR OF LEFT MEDICAL MENISCUS
## [13] IW WAS DRIVING WORK VEHICLE ON WORK TIME MAKING A LT TURN WHEN OV RAN RED LIGHT HITTING IV. IW HAS PAIN IN RT SHOULDER AND MIDDLE OF BACK.
## [14] IW WAS DRIVING AND WAS STOPPED AND WAS REAR-ENDED BY ANOTHER VEHICLE. IW HAS A HEADACHE AND NECK PAIN.
## [15] IE WAS HURT IN VEH THAT WAS HIT IN THE REAR
## [16] IE WAS MAKING A DELIVERY. HE WAS ATTEMPTING TO OPEN ROLL UP DOOR ON BOX TRUCK. THE DOOR WAS DIFFICULT TO OPEN. HE CLIMBED UP TO TRUCK & LIFTED AND FELT SNAP IN HIS LOWER BACK.
## [17] IW ALLEGES AN OCCUPATIONAL HEARING LOSS PER AN ATTORNEY NOTICE.
## [18] LOWER BACK STRAIN CAUSED SEVERE PAIN IN BACK AND RIGHT LEG NUMBNESS OCCURRED WHILE UNLOADING MATERIAL FROM THE BACK OF A DELIVERY TRUCK AT CUSTOMER'S LOCATION
## [19] IW WAS CLOSING A ROLL UP DOOR AT THE END OF SHIFT AND PLACED HIS HAND ON THE TRACK AS THE DOOR ROLLED DOWN IT WENT RIGHT OVER HIS LEFT HAND
## [20] IE WAS MAKING A HOLE THROUGH A 2X4 WITH A 3/8 INCH PADDLE BIT, IT JUMPED OFF THE WOOD AND HIT THE EMPLOYEE ON THE L HAND.
# Transformation is performed using tm_map() function to replace, for example, special characters from the text.
# Replacing "/", "@" and "|" with space:
toSpace <- content_transformer(function (x , pattern ) gsub(pattern, " ", x))
docs <- tm_map(docs, toSpace, "/")
## Warning in tm_map.SimpleCorpus(docs, toSpace, "/"): transformation drops
## documents
docs <- tm_map(docs, toSpace, "@")
## Warning in tm_map.SimpleCorpus(docs, toSpace, "@"): transformation drops
## documents
docs <- tm_map(docs, toSpace, "\\|")
## Warning in tm_map.SimpleCorpus(docs, toSpace, "\\|"): transformation drops
## documents
# Convert the text to lower case
docs <- tm_map(docs, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(docs, content_transformer(tolower)):
## transformation drops documents
# Remove numbers
docs <- tm_map(docs, removeNumbers)
## Warning in tm_map.SimpleCorpus(docs, removeNumbers): transformation drops
## documents
# Remove english common stopwords
docs <- tm_map(docs, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(docs, removeWords, stopwords("english")):
## transformation drops documents
# Remove punctuations
docs <- tm_map(docs, removePunctuation)
## Warning in tm_map.SimpleCorpus(docs, removePunctuation): transformation
## drops documents
# Eliminate extra white spaces
docs <- tm_map(docs, stripWhitespace)
## Warning in tm_map.SimpleCorpus(docs, stripWhitespace): transformation drops
## documents
inspect(docs[1:20])
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 20
##
## [1] iw reported walking back fork lift felt pop right knee fell floor
## [2] ie getting forklift foot slipped felt pain lower back
## [3] iw walking around sheeting machine heard pop lft ankle felt pain upper lft leg diag lft ankle strain per dr
## [4] iw handling material confirming order iw turned hit right elbow desk
## [5] iw got fiber glass debris eye
## [6] iw picking sheet foam glass felt pain back
## [7] iw tripped fell ground due banding pallet
## [8] ie suffered seizure collapsing floor
## [9] ie pulling order box fell onto head causing neck back pain
## [10] iw loading insulation bundles truck iw felt pain left thumb left thumb
## [11] constant bending lifting highlo
## [12] tear left medical meniscus
## [13] iw driving work vehicle work time making lt turn ov ran red light hitting iv iw pain rt shoulder middle back
## [14] iw driving stopped rearended another vehicle iw headache neck pain
## [15] ie hurt veh hit rear
## [16] ie making delivery attempting open roll door box truck door difficult open climbed truck lifted felt snap lower back
## [17] iw alleges occupational hearing loss per attorney notice
## [18] lower back strain caused severe pain back right leg numbness occurred unloading material back delivery truck customers location
## [19] iw closing roll door end shift placed hand track door rolled went right left hand
## [20] ie making hole x inch paddle bit jumped wood hit employee l hand
# Build a term-document matrix
# I have divided it into 2 parts because of space issues
dtm1 <- TermDocumentMatrix(docs[1:28402])
m1 <- as.matrix(dtm1)
v1 <- sort(rowSums(m1),decreasing=TRUE)
d1 <- data.frame(word = names(v1),freq=v1)
head(d1, 10)
## word freq
## states states 12883
## left left 9602
## right right 8896
## back back 8814
## patient patient 8554
## pain pain 6483
## strain strain 6466
## shoulder shoulder 3577
## injured injured 3542
## resident resident 3463
dtm2 <- TermDocumentMatrix(docs[28403:38403])
m2 <- as.matrix(dtm2)
v2 <- sort(rowSums(m2),decreasing=TRUE)
d2 <- data.frame(word = names(v2),freq=v2)
head(d2, 10)
## word freq
## right right 1458
## left left 1332
## strain strain 1260
## struck struck 1208
## back back 1167
## injury injury 1131
## pain pain 1067
## cut cut 1034
## fall fall 962
## provided provided 883
dtm3 <- TermDocumentMatrix(docs[38404:48404])
m3 <- as.matrix(dtm3)
v3 <- sort(rowSums(m3),decreasing=TRUE)
d3 <- data.frame(word = names(v3),freq=v3)
head(d3, 10)
## word freq
## right right 3497
## left left 3201
## pain pain 2451
## back back 2336
## felt felt 1878
## employee employee 1859
## associate associate 1689
## fell fell 1480
## hand hand 1418
## floor floor 1251
dtm4 <- TermDocumentMatrix(docs[48405:56803])
m4 <- as.matrix(dtm4)
v4 <- sort(rowSums(m4),decreasing=TRUE)
d4 <- data.frame(word = names(v4),freq=v4)
head(d4, 10)
## word freq
## left left 1487
## strain strain 1374
## right right 1235
## back back 1013
## fall fall 789
## shoulder shoulder 786
## struck struck 739
## pain pain 722
## knee knee 671
## hand hand 657
d1$rn <- rownames(d1)
d2$rn <- rownames(d2)
d3$rn <- rownames(d3)
d4$rn <- rownames(d4)
# bind the two dataframes together by row and aggregate
res <- aggregate(cbind(freq) ~ rn, rbind(d1,d2,d3,d4), sum)
# assign the rownames again
rownames(res) <- res$rn
set.seed(1234)
wordcloud(words = res$rn, freq = res$freq, min.freq = 10,
max.words=200, random.order=FALSE, rot.per=0.35,
colors=brewer.pal(8, "Dark2"))
# I have merged all the four data frames adding the frequency of the words and created 1 word cloud.
# Alternatively we can also do the analysis on the basis of 4 separated word clouds.
set.seed(1234)
wordcloud(words = d4$word, freq = d4$freq, min.freq = 1,
max.words=200, random.order=FALSE, rot.per=0.35,
colors=brewer.pal(8, "Dark2"))
set.seed(1234)
wordcloud(words = d1$word, freq = d1$freq, min.freq = 1,
max.words=200, random.order=FALSE, rot.per=0.35,
colors=brewer.pal(8, "Dark2"))
set.seed(1234)
wordcloud(words = d2$word, freq = d2$freq, min.freq = 1,
max.words=200, random.order=FALSE, rot.per=0.35,
colors=brewer.pal(8, "Dark2"))
## Warning in wordcloud(words = d2$word, freq = d2$freq, min.freq = 1,
## max.words = 200, : employee could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = d2$word, freq = d2$freq, min.freq = 1,
## max.words = 200, : accident could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = d2$word, freq = d2$freq, min.freq = 1,
## max.words = 200, : slipmiscellaneous could not be fit on page. It will not
## be plotted.
## Warning in wordcloud(words = d2$word, freq = d2$freq, min.freq = 1,
## max.words = 200, : diagnosed could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = d2$word, freq = d2$freq, min.freq = 1,
## max.words = 200, : repetitive could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = d2$word, freq = d2$freq, min.freq = 1,
## max.words = 200, : ground could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d2$word, freq = d2$freq, min.freq = 1,
## max.words = 200, : plane could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d2$word, freq = d2$freq, min.freq = 1,
## max.words = 200, : experienced could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = d2$word, freq = d2$freq, min.freq = 1,
## max.words = 200, : otherwise could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = d2$word, freq = d2$freq, min.freq = 1,
## max.words = 200, : forklift could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = d2$word, freq = d2$freq, min.freq = 1,
## max.words = 200, : started could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = d2$word, freq = d2$freq, min.freq = 1,
## max.words = 200, : unloading could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = d2$word, freq = d2$freq, min.freq = 1,
## max.words = 200, : reported could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = d2$word, freq = d2$freq, min.freq = 1,
## max.words = 200, : index could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d2$word, freq = d2$freq, min.freq = 1,
## max.words = 200, : grease could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d2$word, freq = d2$freq, min.freq = 1,
## max.words = 200, : gate could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d2$word, freq = d2$freq, min.freq = 1,
## max.words = 200, : levelunknown could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = d2$word, freq = d2$freq, min.freq = 1,
## max.words = 200, : miscellunknown could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = d2$word, freq = d2$freq, min.freq = 1,
## max.words = 200, : causestype could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = d2$word, freq = d2$freq, min.freq = 1,
## max.words = 200, : sunknown could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = d2$word, freq = d2$freq, min.freq = 1,
## max.words = 200, : inhalation could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = d2$word, freq = d2$freq, min.freq = 1,
## max.words = 200, : trying could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d2$word, freq = d2$freq, min.freq = 1,
## max.words = 200, : absorption could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = d2$word, freq = d2$freq, min.freq = 1,
## max.words = 200, : warehouse could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = d2$word, freq = d2$freq, min.freq = 1,
## max.words = 200, : sprained could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = d2$word, freq = d2$freq, min.freq = 1,
## max.words = 200, : burns could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d2$word, freq = d2$freq, min.freq = 1,
## max.words = 200, : caused could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d2$word, freq = d2$freq, min.freq = 1,
## max.words = 200, : motion could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d2$word, freq = d2$freq, min.freq = 1,
## max.words = 200, : unknown could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = d2$word, freq = d2$freq, min.freq = 1,
## max.words = 200, : reaching could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = d2$word, freq = d2$freq, min.freq = 1,
## max.words = 200, : foreign could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = d2$word, freq = d2$freq, min.freq = 1,
## max.words = 200, : chest could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d2$word, freq = d2$freq, min.freq = 1,
## max.words = 200, : stacking could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = d2$word, freq = d2$freq, min.freq = 1,
## max.words = 200, : kitchen could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = d2$word, freq = d2$freq, min.freq = 1,
## max.words = 200, : removing could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = d2$word, freq = d2$freq, min.freq = 1,
## max.words = 200, : holding could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = d2$word, freq = d2$freq, min.freq = 1,
## max.words = 200, : came could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d2$word, freq = d2$freq, min.freq = 1,
## max.words = 200, : swelling could not be fit on page. It will not be
## plotted.
set.seed(1234)
wordcloud(words = d3$word, freq = d3$freq, min.freq = 1,
max.words=200, random.order=FALSE, rot.per=0.35,
colors=brewer.pal(8, "Dark2"))
findFreqTerms(dtm1, lowfreq = 40)
## [1] "back" "fell" "felt" "floor"
## [5] "knee" "lift" "pop" "reported"
## [9] "right" "walking" "foot" "forklift"
## [13] "getting" "lower" "pain" "slipped"
## [17] "ankle" "around" "heard" "leg"
## [21] "machine" "per" "strain" "upper"
## [25] "desk" "elbow" "handling" "hit"
## [29] "turned" "eye" "glass" "got"
## [33] "picking" "sheet" "due" "ground"
## [37] "pallet" "tripped" "seizure" "box"
## [41] "causing" "head" "neck" "onto"
## [45] "pulling" "left" "loading" "thumb"
## [49] "truck" "bending" "lifting" "medical"
## [53] "meniscus" "tear" "driving" "hitting"
## [57] "light" "making" "middle" "ran"
## [61] "red" "shoulder" "time" "turn"
## [65] "vehicle" "work" "another" "headache"
## [69] "rearended" "stopped" "hurt" "rear"
## [73] "attempting" "door" "lifted" "open"
## [77] "roll" "snap" "alleges" "loss"
## [81] "caused" "numbness" "severe" "unloading"
## [85] "closing" "end" "hand" "placed"
## [89] "rolled" "shift" "went" "bit"
## [93] "employee" "hole" "inch" "jumped"
## [97] "checking" "foreign" "injured" "object"
## [101] "standing" "broke" "piece" "placing"
## [105] "pot" "bump" "bumped" "contusion"
## [109] "slight" "discomfort" "job" "performing"
## [113] "caught" "stepped" "taking" "behind"
## [117] "cut" "pushing" "arm" "inside"
## [121] "move" "pressure" "push" "put"
## [125] "strained" "starting" "stepping" "side"
## [129] "driver" "ended" "twisted" "hands"
## [133] "holding" "using" "restroom" "step"
## [137] "way" "falling" "keep" "next"
## [141] "tried" "use" "weight" "cleaning"
## [145] "full" "picked" "pull" "three"
## [149] "water" "lbs" "sharp" "boxes"
## [153] "working" "exposed" "grab" "nail"
## [157] "palm" "punctured" "finger" "guard"
## [161] "moving" "rail" "resulted" "jammed"
## [165] "one" "wheel" "hernia" "bend"
## [169] "car" "cart" "heavy" "throwing"
## [173] "trash" "wrist" "area" "cord"
## [177] "cutting" "fall" "metal" "see"
## [181] "towards" "dust" "eyes" "came"
## [185] "close" "edge" "station" "struck"
## [189] "blade" "coming" "stool" "upon"
## [193] "fingers" "reached" "saw" "tip"
## [197] "abdomen" "paper" "table" "lot"
## [201] "parking" "chair" "sit" "agents"
## [205] "causes" "disorders" "miscellaneous" "physical"
## [209] "flying" "twisting" "handled" "others"
## [213] "stationary" "striking" "slip" "different"
## [217] "level" "stairs" "animal" "insect"
## [221] "grease" "liquid" "spills" "aircraft"
## [225] "injury" "body" "motor" "burn"
## [229] "scald" "puncture" "scrape" "collision"
## [233] "tool" "bodily" "exposure" "fluid"
## [237] "rubbed" "mechanical" "disorder" "motion"
## [241] "repetitive" "stress" "injuries" "cumulative"
## [245] "fluids" "hot" "steam" "carrying"
## [249] "chemical" "worker" "reaching" "contact"
## [253] "ladder" "person" "resident" "electric"
## [257] "shock" "fumes" "accident" "auto"
## [261] "resulting" "noc" "opening" "ice"
## [265] "sidewalk" "parts" "broken" "building"
## [269] "elevator" "unknown" "fire" "mold"
## [273] "strike" "trip" "trauma" "bitten"
## [277] "absorption" "ingestion" "inhalation" "scraped"
## [281] "needle" "stick" "snow" "reaction"
## [285] "cause" "patient" "air" "steps"
## [289] "twist" "going" "grabbed" "strains"
## [293] "bilateral" "knees" "fracture" "cuff"
## [297] "possible" "rotator" "groin" "climbing"
## [301] "hard" "sprain" "changing" "chest"
## [305] "mva" "container" "handle" "low"
## [309] "pulled" "shin" "site" "sprayed"
## [313] "balance" "lost" "shoulders" "stuck"
## [317] "trying" "frame" "stomach" "cap"
## [321] "popped" "laceration" "cabinet" "jerked"
## [325] "grabbing" "hip" "rib" "ees"
## [329] "hose" "ring" "contusions" "wall"
## [333] "exiting" "turning" "arms" "numb"
## [337] "front" "slammed" "sore" "tingling"
## [341] "top" "near" "opened" "still"
## [345] "bar" "index" "thigh" "wash"
## [349] "dolly" "legs" "pains" "splashed"
## [353] "glove" "tank" "bent" "dropped"
## [357] "part" "transferring" "swollen" "face"
## [361] "fainted" "stood" "smashed" "ribs"
## [365] "burned" "line" "loose" "tubing"
## [369] "removed" "infection" "office" "tailbone"
## [373] "ear" "passed" "buckled" "forearm"
## [377] "allergic" "itching" "redness" "bottom"
## [381] "skin" "spilled" "window" "nose"
## [385] "found" "someone" "landing" "became"
## [389] "confused" "feel" "started" "wet"
## [393] "gas" "feeling" "cargo" "missed"
## [397] "heel" "concussion" "involved" "personal"
## [401] "bumps" "passenger" "locked" "unit"
## [405] "backwards" "away" "putting" "black"
## [409] "pushed" "seat" "burning" "self"
## [413] "multiple" "gave" "inflammation" "pinky"
## [417] "flew" "landed" "remove" "calf"
## [421] "bag" "abdominal" "hanging" "supplies"
## [425] "razor" "lid" "removing" "break"
## [429] "forward" "outside" "uneven" "two"
## [433] "across" "take" "garbage" "large"
## [437] "tube" "dermatitis" "something" "forehead"
## [441] "curb" "bed" "bites" "intersection"
## [445] "stop" "anxiety" "buttocks" "tooth"
## [449] "leaning" "bathroom" "rash" "catch"
## [453] "lock" "get" "change" "coworker"
## [457] "knife" "plastic" "food" "toe"
## [461] "chart" "pointer" "hallway" "computer"
## [465] "corner" "diagnosed" "experiencing" "since"
## [469] "gate" "lacerated" "finished" "held"
## [473] "high" "reports" "place" "incident"
## [477] "received" "several" "developed" "warehouse"
## [481] "feels" "raised" "facility" "sustained"
## [485] "coffee" "lumbar" "pole" "small"
## [489] "strap" "shelf" "new" "sliding"
## [493] "duties" "lateral" "moved" "client"
## [497] "sitting" "str" "pricked" "ramp"
## [501] "walked" "inner" "bruise" "board"
## [505] "set" "cover" "room" "drawer"
## [509] "day" "experienced" "specific" "past"
## [513] "weeks" "gloves" "used" "treatment"
## [517] "supervisor" "closed" "employees" "swelling"
## [521] "stated" "sensation" "soreness" "abrasion"
## [525] "scale" "equipment" "plate" "post"
## [529] "slid" "itchy" "said" "stand"
## [533] "claim" "old" "states" "claims"
## [537] "morning" "muscle" "stacking" "woke"
## [541] "also" "worse" "walk" "sign"
## [545] "process" "reach" "pick" "stating"
## [549] "position" "approx" "feet" "long"
## [553] "aide" "test" "claimant" "dumpster"
## [557] "non" "underneath" "months" "needlestick"
## [561] "closet" "linen" "flushing" "gtube"
## [565] "spider" "can" "nurse" "feeding"
## [569] "assisting" "cervical" "asked" "help"
## [573] "unable" "denied" "residents" "toilet"
## [577] "wheelchair" "shower" "phone" "hurting"
## [581] "patients" "bleeding" "pts" "wound"
## [585] "reposition" "butterfly" "safety" "pan"
## [589] "abg" "giving" "sling" "transfer"
## [593] "cna" "rolling" "treating" "areas"
## [597] "assist" "family" "health" "home"
## [601] "like" "symptoms" "bbp" "bedside"
## [605] "catheter" "running" "blood" "drawing"
## [609] "days" "helping" "member" "staff"
## [613] "unsure" "care" "radiating" "scratched"
## [617] "code" "emp" "port" "clean"
## [621] "injection" "med" "helped" "thoracic"
## [625] "breaking" "give" "walker" "belt"
## [629] "gait" "insulin" "slide" "kicked"
## [633] "began" "dining" "ago" "state"
## [637] "thought" "disconnected" "tangled" "ambulance"
## [641] "called" "without" "bariatric" "quickly"
## [645] "sat" "suddenly" "call" "possibly"
## [649] "nurses" "continued" "pants" "took"
## [653] "tray" "talking" "resisted" "commode"
## [657] "dishes" "spr" "bruised" "transfering"
## [661] "boosting" "moderate" "weak" "lancet"
## [665] "sharps" "bedpan" "device" "dog"
## [669] "entering" "nursing" "throbbing" "toward"
## [673] "sprained" "shoes" "aggressive" "draw"
## [677] "mid" "therapist" "transferred" "icu"
## [681] "bear" "punched" "alleged" "second"
## [685] "limp" "control" "oven" "alarm"
## [689] "last" "seen" "urine" "hospital"
## [693] "noticed" "wheels" "mat" "lowered"
## [697] "attempted" "mattress" "syringe" "backward"
## [701] "let" "rest" "sudden" "hall"
## [705] "medication" "check" "washing" "now"
## [709] "report" "thinks" "times" "crack"
## [713] "repositioning" "providing" "meds" "splash"
## [717] "swung" "cup" "kitchen" "lunch"
## [721] "night" "prevent" "injuring" "hoyer"
## [725] "flushed" "foley" "positive" "scabies"
## [729] "wrists" "recliner" "dressing" "manager"
## [733] "wrong" "assistance" "shut" "adjusting"
## [737] "reporting" "bruising" "noted" "therapy"
## [741] "little" "related" "bite" "people"
## [745] "just" "dirty" "combative" "sticking"
## [749] "toes" "big" "happened" "make"
## [753] "medications" "painful" "mouth" "made"
## [757] "medicine" "positioning" "later" "administering"
## [761] "contaminated" "shooting" "leaving" "drew"
## [765] "trach" "first" "twinge" "hurts"
## [769] "told" "house" "alert" "done"
## [773] "may" "denies" "source" "assisted"
## [777] "res" "obese" "bars" "parallel"
## [781] "gym" "ready" "pay" "jaw"
## [785] "shoe" "laundry" "coughed" "entire"
## [789] "clients" "caring" "sputum" "vent"
## [793] "chicken" "leaned" "hold" "bathing"
## [797] "able" "buttock" "cheek" "calling"
## [801] "hours" "dizzy" "deep" "emptying"
## [805] "accidently" "agitated" "threw" "scratch"
## [809] "movement" "gown" "applied" "week"
## [813] "spine" "accidentally" "mopped" "doctor"
## [817] "today" "twice" "tow" "already"
## [821] "lumbosacral" "bone" "spasms" "pad"
## [825] "whole" "bloody" "yesterday" "great"
## [829] "bags" "coworkers" "region" "aid"
## [833] "pen" "washed" "secretions" "know"
## [837] "heparin" "shot" "soap" "acute"
## [841] "stumbled" "breast" "squeezed" "bath"
## [845] "backed" "entered" "almost" "disposing"
## [849] "needed" "fmc" "van" "lip"
## [853] "difficulty" "given" "treated" "minutes"
## [857] "poked" "sugar" "sent" "fnol"
## [861] "liens" "passing" "oxygen" "base"
## [865] "sink" "limping" "currently" "oacr"
## [869] "cat" "range" "contents" "flight"
## [873] "hrs" "trays" "cleaned" "human"
## [877] "taken" "via" "ems" "sought"
## [881] "clinic" "limited" "spit" "caller"
## [885] "mild" "visit" "size" "unk"
## [889] "bled" "squirted" "driveway" "tug"
## [893] "unspecified" "radiates" "sup" "sxa"
## [897] "pit" "sxxa" "wxxa" "luggage"
## [901] "baggage" "loader" "compartment"
# We can also find association of terms on the basis of correlation
findAssocs(dtm1, terms = "pain", corlimit = 0.3)
## $pain
## felt back
## 0.36 0.30
# One analysis that we can do here is that, on the basis of which body part the case goes to litigtion
# For now I will just create some new variables
objective3$hasStrain <- str_detect(objective3$Cause.Description,pattern = "strain|Strain|STRAIN")
objective3$hasPain <- str_detect(objective3$Cause.Description,pattern = "pain|Pain|PAIN")
objective3$hasShoulderInjury <- str_detect(objective3$Cause.Description,pattern = "(shoulder|Shoulder|SHOULDER)&(injured|Injured|INJURED|Injury|injury|INJURY)")
objective3$hasKneeProblem <- str_detect(objective3$Cause.Description,pattern = "knee|Knee|KNEE")
objective3$hasWristProblem <- str_detect(objective3$Cause.Description,pattern = "wrist|Wrist|WRIST")
objective3$hasSprain <- str_detect(objective3$Cause.Description,pattern = "sprain|Sprain|SPRAIN")
objective3$hasSlipped <- str_detect(objective3$Cause.Description,pattern = "slipped|Slipped|SLIPPED|slip|Slip|SLIP")
head(objective3)
## Cause.Description
## 1 THE IW REPORTED HE WAS WALKING BACK TO FORK LIFT AND FELT A POP IN RIGHT KNEE AND FELL TO FLOOR.
## 2 IE WAS GETTING OFF A FORKLIFT, FOOT SLIPPED AND HE FELT PAIN IN LOWER BACK
## 3 IW WAS WALKING AROUND A SHEETING MACHINE, WHEN SHE HEARD A POP IN LFT ANKLE AND FELT PAIN IN THE UPPER LFT LEG, DIAG: LFT ANKLE STRAIN PER DR.
## 4 IW WAS HANDLING MATERIAL, CONFIRMING ORDER, IW TURNED & HIT HIS RIGHT ELBOW ON A DESK
## 5 IW GOT FIBER GLASS DEBRIS IN EYE
## 6 THE IW WAS PICKING UP A SHEET OF FOAM GLASS WHEN HE FELT A PAIN IN HIS BACK.
## High.Cost hasStrain hasPain hasShoulderInjury hasKneeProblem
## 1 0 FALSE FALSE FALSE TRUE
## 2 1 FALSE TRUE FALSE FALSE
## 3 0 TRUE TRUE FALSE FALSE
## 4 0 FALSE FALSE FALSE FALSE
## 5 0 FALSE FALSE FALSE FALSE
## 6 0 FALSE TRUE FALSE FALSE
## hasWristProblem hasSprain hasSlipped
## 1 FALSE FALSE FALSE
## 2 FALSE FALSE TRUE
## 3 FALSE FALSE FALSE
## 4 FALSE FALSE FALSE
## 5 FALSE FALSE FALSE
## 6 FALSE FALSE FALSE
# More fields can be created by checking the associations and correlations of words